Load Library

library(tidyverse)
library(tidyr)
library(tidymodels)
library(tidytext)
library(janitor)
library(skimr)
library(vip)
library(parallel)
library(doParallel)
library(rpart.plot)
library(textrecipes)
library(stringi)
library(xgboost)
library(DALEXtra)
library(DALEX)
library(MASS)
library(solitude)
library(rpart.plot)
library(ggpubr)

Import Data

loan <- read_csv("loan_train.csv", na = c("NA", "", "-")) %>% 
  mutate(int_rate = as.numeric(str_replace(int_rate, "%", "")),
         revol_util = as.numeric(str_replace(revol_util, "%", "")),
         empl_year = word(emp_length)) %>% clean_names()
## Rows: 29777 Columns: 52
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (23): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne...
## dbl (29): id, member_id, loan_amnt, funded_amnt, funded_amnt_inv, installmen...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skim(loan)
Data summary
Name loan
Number of rows 29777
Number of columns 53
_______________________
Column type frequency:
character 22
numeric 31
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
term 3 1.00 9 9 0 2 0
grade 3 1.00 1 1 0 7 0
sub_grade 3 1.00 2 2 0 35 0
emp_title 1817 0.94 2 75 0 22143 0
emp_length 3 1.00 3 9 0 12 0
home_ownership 3 1.00 3 8 0 5 0
verification_status 3 1.00 8 15 0 3 0
issue_d 3 1.00 8 8 0 55 0
loan_status 0 1.00 7 7 0 2 0
pymnt_plan 3 1.00 1 1 0 2 0
url 3 1.00 62 64 0 29774 0
desc 9432 0.68 1 3943 0 20310 0
purpose 3 1.00 3 18 0 14 0
title 13 1.00 2 80 0 15200 0
zip_code 3 1.00 5 5 0 819 0
addr_state 3 1.00 2 2 0 50 0
earliest_cr_line 23 1.00 8 8 0 516 0
last_pymnt_d 67 1.00 8 8 0 106 0
next_pymnt_d 27425 0.08 8 8 0 96 0
last_credit_pull_d 5 1.00 8 8 0 109 0
application_type 3 1.00 10 10 0 1 0
empl_year 3 1.00 1 3 0 12 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 3 1.00 663006.18 219881.52 54734.00 496344.25 642763.00 825647.50 1077501.00 ▁▃▇▆▅
member_id 3 1.00 823568.15 280420.33 70473.00 635380.00 822247.50 1033656.25 1314167.00 ▁▃▇▇▆
loan_amnt 3 1.00 11109.43 7404.65 500.00 5225.00 9800.00 15000.00 35000.00 ▇▇▃▂▁
funded_amnt 3 1.00 10843.64 7147.05 500.00 5100.00 9600.00 15000.00 35000.00 ▇▇▃▂▁
funded_amnt_inv 3 1.00 10149.66 7130.86 0.00 4950.00 8500.00 14000.00 35000.00 ▇▆▃▁▁
int_rate 3 1.00 12.17 3.72 5.42 9.63 11.99 14.72 24.11 ▆▇▇▂▁
installment 3 1.00 323.81 209.77 15.67 165.84 278.94 429.86 1305.19 ▇▆▂▁▁
annual_inc 4 1.00 69201.23 66566.42 2000.00 40000.00 59000.00 82500.00 6000000.00 ▇▁▁▁▁
dti 3 1.00 13.38 6.74 0.00 8.19 13.49 18.70 29.99 ▅▇▇▆▂
delinq_2yrs 23 1.00 0.16 0.52 0.00 0.00 0.00 0.00 13.00 ▇▁▁▁▁
fico_range_low 3 1.00 713.05 36.31 610.00 685.00 710.00 740.00 825.00 ▁▇▇▅▁
fico_range_high 3 1.00 717.05 36.31 614.00 689.00 714.00 744.00 829.00 ▁▇▇▅▁
inq_last_6mths 23 1.00 1.08 1.54 0.00 0.00 1.00 2.00 33.00 ▇▁▁▁▁
mths_since_last_delinq 18907 0.37 34.72 22.32 0.00 17.00 32.00 51.00 120.00 ▇▇▅▂▁
mths_since_last_record 27208 0.09 59.23 47.22 0.00 0.00 85.00 102.00 129.00 ▇▁▂▆▅
open_acc 23 1.00 9.34 4.51 1.00 6.00 9.00 12.00 47.00 ▇▃▁▁▁
pub_rec 23 1.00 0.06 0.25 0.00 0.00 0.00 0.00 5.00 ▇▁▁▁▁
revol_bal 3 1.00 14310.00 22696.55 0.00 3644.00 8783.50 17187.00 1207359.00 ▇▁▁▁▁
revol_util 67 1.00 49.09 28.33 0.00 25.83 49.50 72.60 119.00 ▇▇▇▇▁
total_acc 23 1.00 22.08 11.59 1.00 13.00 20.00 29.00 81.00 ▇▇▂▁▁
out_prncp 3 1.00 11.80 123.53 0.00 0.00 0.00 0.00 3126.61 ▇▁▁▁▁
out_prncp_inv 3 1.00 11.76 123.23 0.00 0.00 0.00 0.00 3123.44 ▇▁▁▁▁
total_rec_late_fee 3 1.00 1.50 7.72 0.00 0.00 0.00 0.00 180.20 ▇▁▁▁▁
last_pymnt_amnt 3 1.00 2615.41 4373.70 0.00 211.02 531.98 3171.29 36115.20 ▇▁▁▁▁
collections_12_mths_ex_med 104 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
policy_code 3 1.00 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁
acc_now_delinq 23 1.00 0.00 0.01 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
chargeoff_within_12_mths 104 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
delinq_amnt 23 1.00 0.20 35.09 0.00 0.00 0.00 0.00 6053.00 ▇▁▁▁▁
pub_rec_bankruptcies 966 0.97 0.05 0.21 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
tax_liens 79 1.00 0.00 0.01 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁

Exploratory Analysis - Categorical

loan <- loan %>%
  mutate_if(is.character, as.factor) %>%
  mutate(loan_status = factor(loan_status))

loan %>%
  count(loan_status) %>%
  mutate(pct = n/sum(n)) -> default_rate

default_rate %>%
  ggplot(aes(x = loan_status, y = pct)) +
  geom_col() +
  geom_text(aes(label = round(pct, 4)), color = "red") +
  labs(title = "Loan Default Rate")

# -- explore factors
char_explore <- function(col){loan %>%
    na.omit() %>%
    ggplot(., aes(!!as.name(col))) +
    geom_bar(aes(fill=loan_status), position="fill") +
    theme(axis.text.x=element_text(angle=90, hjust=1)) +
    labs(title = col)
}

for (column in names(loan %>% select_if(is.factor))){
  chrt <- char_explore(column)
  print(chrt)
}

# -- term
loan %>%
  group_by(term, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = term, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Term",
       x = "Term",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- grade
loan %>%
  group_by(grade, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = grade, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Grade",
       x = "Grade",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- sub grade
loan %>%
  group_by(sub_grade, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = sub_grade, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Sub Grade",
       x = "Sub Grade",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- emp_length
loan %>%
  group_by(emp_length, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = emp_length, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Emp Length",
       x = "Emp Length",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- home ownership
loan %>%
  group_by(home_ownership, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = home_ownership, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Home Ownership",
       x = "Home Ownership",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- verification status
loan %>%
  group_by(verification_status, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = verification_status, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Verification Status",
       x = "Verification Status",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- payment plan
loan %>%
  group_by(pymnt_plan, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = pymnt_plan, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Payment Plan",
       x = "Payment Plan",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- purpose
loan %>%
  group_by(purpose, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = purpose, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default,Purpose",
       x = "Purpose",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- acc_now_delinq
loan %>%
  group_by(acc_now_delinq, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = acc_now_delinq, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Account Now Delinq",
       x = "Account Now Delinq",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- bankruptcy
loan %>%
  group_by(pub_rec_bankruptcies, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = pub_rec_bankruptcies, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Pub Rec Bankruptcies",
       x = "Pub Rec Bankruptcies",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- tax_liens
loan %>%
  group_by(tax_liens, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = tax_liens, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, tax_liens",
       x = "tax_liens",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- policy code
loan %>%
  group_by(policy_code, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = policy_code, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Policy Code",
       x = "Policy Code",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

# -- address state
loan %>%
  group_by(addr_state, loan_status) %>%
  summarize(count = n()) %>%
  mutate(pct = count/sum(count)) %>%
  ggplot(aes(x = addr_state, y = pct, fill = loan_status)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
  labs(title = "% Default, Address State",
       x = "Address State",
       y = "Percentage Current vs. Default",
       fill = "Loan Default") +
  geom_hline(yintercept = 0.1503509,
             linetype = "dashed",
             color = "black",
             size = 0.5) + coord_flip()

Exploratory Analysis - Numeric

n_cols <- names(loan %>% select_if(is.numeric) %>% dplyr::select(-id, -member_id))
my_hist <- function(col){
  loan %>%
    summarise(n = n(),
              n_miss = sum(is.na(!!as.name(col))),
              n_dist = n_distinct(!!as.name(col)),
              mean = round(mean(!!as.name(col), na.rm = TRUE), 2),
              min = min(!!as.name(col), na.rm = TRUE),
              max = max(!!as.name(col), na.rm = TRUE)) -> col_summary
  
  p <- ggtexttable(col_summary, rows = NULL,
                   theme = ttheme("mOrange"))
  
  l1 <- loan %>%
    ggplot(aes(x = !!as.name(col), fill = loan_status)) +
    geom_histogram(bins = 30) +
    labs(title = col)
  
  plt <- ggarrange(l1, p, ncol = 1, nrow = 2, heights = c(1, 0.3))
  print(plt)
}

for (c in n_cols){
  my_hist(c)
}

# -- correlation matrix
cor <- loan %>%
  na.omit() %>%
  select_if(is.numeric) %>%
  cor() %>%
  as.data.frame() %>%
  rownames_to_column(var = "variable")

cor %>%
  pivot_longer(cols = c("loan_amnt",
                        "funded_amnt",
                        "funded_amnt_inv",
                        "installment",
                        "annual_inc",
                        "dti",
                        "delinq_2yrs",
                        "fico_range_low",
                        "fico_range_high",
                        "inq_last_6mths",
                        "mths_since_last_delinq",
                        "mths_since_last_record",
                        "open_acc",
                        "pub_rec",
                        "revol_bal",
                        "total_acc",
                        "out_prncp",
                        "out_prncp_inv",
                        "total_rec_late_fee",
                        "delinq_amnt"),
               names_to = "name",
               values_to = "correlation") %>%
  ggplot(aes(x = variable, y = name, fill = correlation)) +
  geom_tile() +
  labs(title = "Correlation Matrix",
       x = "Variable",
       y = "Variable") +
  scale_fill_gradient2(mid = "#FBFEF9",
                       low = "#0C6291",
                       high = "#A63446") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  geom_text(aes(label = round(correlation, 2)), color = "Black", size = 1.5)

Partition Data

set.seed(123)

split <- initial_split(loan, prop = 0.7, strata = loan_status)
train <- training(split)
test <- testing(split)

sprintf("Train PCT : %1.2f%%", nrow(train)/nrow(loan) * 100)
## [1] "Train PCT : 70.00%"
sprintf("Test PCT  : %1.2f%%", nrow(test)/nrow(loan) * 100)
## [1] "Test PCT  : 30.00%"
loan_fold <- train %>% sample_frac(0.2)

Recipe

loan_recipe <- recipe(loan_status ~ ., data = train) %>%
  step_rm(emp_title, url, desc, title, zip_code, earliest_cr_line, last_pymnt_d, addr_state,
          issue_d, next_pymnt_d, last_credit_pull_d, id, member_id) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  themis::step_downsample(loan_status, under_ratio = 3) %>%
  step_nzv(all_nominal_predictors())

Models & Workflows

# -- Logistic Model
log_model <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")

log_workflow_fit <- workflow() %>%
  add_recipe(loan_recipe) %>%
  add_model(log_model) %>%
  fit(train)

# -- XGBoost Model
xgb_model <- boost_tree(trees = 20) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_workflow_fit <- workflow() %>%
  add_recipe(loan_recipe) %>%
  add_model(xgb_model) %>%
  fit(train)

# -- Random Forest Model
rf_model <- rand_forest(trees = 100) %>%
  set_engine("ranger", num.thread = 8, importance = "permutation") %>%
  set_mode("classification")

rf_workflow_fit <- workflow() %>%
  add_recipe(loan_recipe) %>%
  add_model(rf_model) %>%
  fit(train)

Model Evaluation

evaluate_models <- function(model_workflow, model_name){
  # -- make predictions
  score_train <- bind_cols(
    predict(model_workflow, train, type = "prob"),
    predict(model_workflow, train, type = "class"),
    train) %>%
    mutate(part = "train")
  
  score_test <- bind_cols(
    predict(model_workflow, test, type = "prob"),
    predict(model_workflow, test, type = "class"),
    test) %>%
    mutate(part = "test")
  
  options(yardstick.event_first = FALSE)
  
  bind_rows(score_train, score_test) %>%
    group_by(part) %>%
    metrics(loan_status, .pred_default, estimate = .pred_class) %>%
    pivot_wider(id_cols = part, names_from = .metric, values_from = .estimate) %>%
    mutate(model_name = model_name) %>% print()
  
  # -- precision
  bind_rows(score_train, score_test) %>%
    group_by(part) %>%
    precision(loan_status, .pred_class) %>%
    mutate(model_name = model_name) %>% print()

# -- recall
  bind_rows(score_train, score_test) %>%
    group_by(part) %>%
    recall(loan_status, .pred_class) %>%
    mutate(model_name = model_name) %>% print()
  
  # -- roc curve
  bind_rows(score_train, score_test) %>%
    group_by(part) %>%
    roc_curve(truth = loan_status, predicted = .pred_default) %>%
    autoplot() +
    geom_vline(xintercept = 0.04, color = "red", linetype = "longdash") +
    geom_vline(xintercept = 0.20, color = "black", linetype = "longdash") +
    labs(title = model_name, x = "FPR(1 - specificity)", y = "TPR(recall)") -> roc_chart
  
  print(roc_chart)
  
  # -- operating range 0 - 10%
  operating_range <- score_test %>%
    roc_curve(loan_status, .pred_default) %>%
    mutate(fpr = 1 - round(specificity, 2),
           tpr = round(sensitivity, 3),
           threshold = 1 - round(.threshold, 3)) %>%
    #dplyr::select(fpr, tpr, threshold) %>%
    group_by(fpr) %>%
    summarise(threshold = round(mean(threshold), 3),
              tpr = mean(tpr)) %>%
    #mutate(precision = tpr/(fpr+tpr)) %>%
    #mutate_if(is.numeric, round, 2) %>%
    arrange(fpr) %>%
    filter(fpr <= 0.1)
  
  print(operating_range)
  
  # -- score distribution
  score_test %>%
    ggplot(aes(.pred_default, fill = loan_status)) +
    geom_histogram(bins = 50) +
    geom_vline(aes(xintercept = .5, color = "red")) +
    geom_vline(aes(xintercept = .3, color = "green")) +
    geom_vline(aes(xintercept = .7, color = "blue")) +
    labs(title = model_name) -> score_dist
  
  print(score_dist)
  
  # -- variable importance
  model_workflow %>%
    extract_fit_parsnip() %>%
    vip(30) +
    labs(model_name) -> vip_model
  
  print(vip_model)
}

evaluate_models(xgb_workflow_fit, "XGBoost Model")
## # A tibble: 2 × 6
##   part  accuracy   kap mn_log_loss roc_auc model_name   
##   <chr>    <dbl> <dbl>       <dbl>   <dbl> <chr>        
## 1 test     0.875 0.509       0.278   0.906 XGBoost Model
## 2 train    0.898 0.596       0.249   0.934 XGBoost Model
## # A tibble: 2 × 5
##   part  .metric   .estimator .estimate model_name   
##   <chr> <chr>     <chr>          <dbl> <chr>        
## 1 test  precision binary         0.586 XGBoost Model
## 2 train precision binary         0.665 XGBoost Model
## # A tibble: 2 × 5
##   part  .metric .estimator .estimate model_name   
##   <chr> <chr>   <chr>          <dbl> <chr>        
## 1 test  recall  binary         0.579 XGBoost Model
## 2 train recall  binary         0.647 XGBoost Model

## # A tibble: 11 × 3
##       fpr threshold    tpr
##     <dbl>     <dbl>  <dbl>
##  1 0       -Inf     0.0990
##  2 0.0100     0.221 0.236 
##  3 0.0200     0.287 0.318 
##  4 0.0300     0.346 0.387 
##  5 0.0400     0.39  0.446 
##  6 0.0500     0.427 0.488 
##  7 0.0600     0.458 0.524 
##  8 0.0700     0.491 0.567 
##  9 0.0800     0.524 0.612 
## 10 0.09       0.55  0.645 
## 11 0.1        0.572 0.668

evaluate_models(rf_workflow_fit, "Random Forest Model")
## # A tibble: 2 × 6
##   part  accuracy   kap mn_log_loss roc_auc model_name         
##   <chr>    <dbl> <dbl>       <dbl>   <dbl> <chr>              
## 1 test     0.867 0.420       0.325   0.878 Random Forest Model
## 2 train    0.966 0.872       0.206   0.993 Random Forest Model
## # A tibble: 2 × 5
##   part  .metric   .estimator .estimate model_name         
##   <chr> <chr>     <chr>          <dbl> <chr>              
## 1 test  precision binary         0.580 Random Forest Model
## 2 train precision binary         0.860 Random Forest Model
## # A tibble: 2 × 5
##   part  .metric .estimator .estimate model_name         
##   <chr> <chr>   <chr>          <dbl> <chr>              
## 1 test  recall  binary         0.431 Random Forest Model
## 2 train recall  binary         0.925 Random Forest Model

## # A tibble: 11 × 3
##       fpr threshold    tpr
##     <dbl>     <dbl>  <dbl>
##  1 0       -Inf     0.0559
##  2 0.0100     0.349 0.159 
##  3 0.0200     0.402 0.249 
##  4 0.0300     0.438 0.318 
##  5 0.0400     0.464 0.367 
##  6 0.0500     0.488 0.407 
##  7 0.0600     0.509 0.453 
##  8 0.0700     0.528 0.500 
##  9 0.0800     0.548 0.537 
## 10 0.09       0.563 0.564 
## 11 0.1        0.576 0.590

evaluate_models(log_workflow_fit, "Logistic Model")
## # A tibble: 2 × 6
##   part  accuracy   kap mn_log_loss roc_auc model_name    
##   <chr>    <dbl> <dbl>       <dbl>   <dbl> <chr>         
## 1 test     0.857 0.426       0.321   0.871 Logistic Model
## 2 train    0.862 0.440       0.315   0.874 Logistic Model
## # A tibble: 2 × 5
##   part  .metric   .estimator .estimate model_name    
##   <chr> <chr>     <chr>          <dbl> <chr>         
## 1 test  precision binary         0.527 Logistic Model
## 2 train precision binary         0.544 Logistic Model
## # A tibble: 2 × 5
##   part  .metric .estimator .estimate model_name    
##   <chr> <chr>   <chr>          <dbl> <chr>         
## 1 test  recall  binary         0.494 Logistic Model
## 2 train recall  binary         0.499 Logistic Model

## # A tibble: 11 × 3
##       fpr threshold    tpr
##     <dbl>     <dbl>  <dbl>
##  1 0       -Inf     0.0247
##  2 0.0100     0.178 0.105 
##  3 0.0200     0.267 0.203 
##  4 0.0300     0.328 0.284 
##  5 0.0400     0.373 0.342 
##  6 0.0500     0.413 0.385 
##  7 0.0600     0.447 0.426 
##  8 0.0700     0.477 0.468 
##  9 0.0800     0.502 0.499 
## 10 0.09       0.522 0.525 
## 11 0.1        0.543 0.554

XGBoost Evaluation

options(yardstick.event_first = FALSE)

# -- training
predict(xgb_workflow_fit, train, type = "prob") %>%
  bind_cols(predict(xgb_workflow_fit, train, type = "class")) %>%
  mutate(part = "train") %>%
  bind_cols(., train) -> xgb_scored_train

# -- testing
predict(xgb_workflow_fit, test, type = "prob") %>%
  bind_cols(predict(xgb_workflow_fit, test, type = "class")) %>%
  mutate(part = "test") %>%
  bind_cols(., test) -> xgb_scored_test

# -- Confusion Matrix
xgb_scored_train %>%
  conf_mat(loan_status, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = "Train Confusion Matrix")

xgb_scored_test %>%
  conf_mat(loan_status, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = "Test Confusion Matrix")

Kaggle

loan_kaggle <- read_csv("loan_holdout.csv", na = c("NA", "", "-")) %>% 
  mutate(int_rate = as.numeric(str_replace(int_rate, "%", "")),
         revol_util = as.numeric(str_replace(revol_util, "%", "")),
         empl_year = word(emp_length)) %>%
  mutate_if(is.character, as.factor) %>%
  clean_names()
## Rows: 12761 Columns: 51
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne...
## dbl (29): id, member_id, loan_amnt, funded_amnt, funded_amnt_inv, installmen...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skim(loan_kaggle)
Data summary
Name loan_kaggle
Number of rows 12761
Number of columns 52
_______________________
Column type frequency:
factor 21
numeric 31
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
term 0 1.00 FALSE 2 36 : 9374, 60 : 3387
grade 0 1.00 FALSE 7 B: 3769, A: 3041, C: 2672, D: 1748
sub_grade 0 1.00 FALSE 35 B3: 909, B5: 875, A4: 861, A5: 836
emp_title 804 0.94 FALSE 10242 US : 44, Ban: 32, AT&: 22, IBM: 21
emp_length 0 1.00 FALSE 12 10+: 2792, < 1: 1571, 2 y: 1430, 3 y: 1356
home_ownership 0 1.00 FALSE 5 REN: 6117, MOR: 5619, OWN: 976, OTH: 45
verification_status 0 1.00 FALSE 3 Not: 5630, Ver: 4011, Sou: 3120
issue_d 0 1.00 FALSE 55 Dec: 686, Nov: 664, Oct: 651, Sep: 614
pymnt_plan 0 1.00 FALSE 1 n: 12761
url 0 1.00 FALSE 12761 htt: 1, htt: 1, htt: 1, htt: 1
desc 4088 0.68 FALSE 8664 Deb: 4, Cam: 3, con: 2, con: 2
purpose 0 1.00 FALSE 14 deb: 5960, cre: 1627, oth: 1317, hom: 973
title 1 1.00 FALSE 7073 Deb: 702, Deb: 536, Per: 194, Con: 177
zip_code 0 1.00 FALSE 760 100: 191, 900: 161, 112: 154, 945: 152
addr_state 0 1.00 FALSE 50 CA: 2241, NY: 1229, FL: 904, TX: 848
earliest_cr_line 9 1.00 FALSE 474 Oct: 117, Dec: 115, Oct: 115, Dec: 110
last_pymnt_d 19 1.00 FALSE 105 Mar: 324, Dec: 313, Apr: 284, May: 281
next_pymnt_d 11817 0.07 FALSE 88 Oct: 163, Apr: 29, Feb: 29, Mar: 28
last_credit_pull_d 2 1.00 FALSE 107 Sep: 4908, Mar: 277, Aug: 233, Jul: 196
application_type 0 1.00 FALSE 1 IND: 12761
empl_year 0 1.00 FALSE 12 10+: 2792, <: 1571, 2: 1430, 3: 1356

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 668251.54 217909.00 55521.00 502532.00 651269.00 826140.00 1077175.00 ▁▃▇▇▅
member_id 0 1.00 830682.55 277425.28 70626.00 645610.00 832879.00 1034801.00 1313524.00 ▁▃▇▇▆
loan_amnt 0 1.00 11043.73 7425.67 500.00 5000.00 9600.00 15000.00 35000.00 ▇▆▃▂▁
funded_amnt 0 1.00 10770.14 7146.61 500.00 5000.00 9500.00 15000.00 35000.00 ▇▇▃▂▁
funded_amnt_inv 0 1.00 10116.91 7133.85 0.00 4903.99 8400.00 14000.00 35000.00 ▇▆▃▁▁
int_rate 0 1.00 12.16 3.69 5.42 9.62 11.99 14.72 24.59 ▅▇▆▂▁
installment 0 1.00 319.86 206.93 16.85 164.46 274.63 424.60 1269.73 ▇▆▂▁▁
annual_inc 3 1.00 68985.62 57925.95 1896.00 40000.00 59000.00 82435.00 2039784.00 ▇▁▁▁▁
dti 0 1.00 13.35 6.70 0.00 8.23 13.43 18.63 29.96 ▅▇▇▆▁
delinq_2yrs 9 1.00 0.15 0.48 0.00 0.00 0.00 0.00 9.00 ▇▁▁▁▁
fico_range_low 0 1.00 713.05 35.90 620.00 685.00 710.00 740.00 825.00 ▁▇▇▃▁
fico_range_high 0 1.00 717.05 35.90 624.00 689.00 714.00 744.00 829.00 ▁▇▇▃▁
inq_last_6mths 9 1.00 1.08 1.49 0.00 0.00 1.00 2.00 27.00 ▇▁▁▁▁
mths_since_last_delinq 8022 0.37 35.71 22.64 0.00 17.00 34.00 52.00 115.00 ▇▇▅▂▁
mths_since_last_record 11679 0.08 59.06 47.00 0.00 0.00 85.00 101.00 120.00 ▇▁▂▅▇
open_acc 9 1.00 9.36 4.46 1.00 6.00 9.00 12.00 36.00 ▇▇▂▁▁
pub_rec 9 1.00 0.06 0.24 0.00 0.00 0.00 0.00 4.00 ▇▁▁▁▁
revol_bal 0 1.00 14269.54 20349.43 0.00 3625.00 8903.00 17345.00 602519.00 ▇▁▁▁▁
revol_util 26 1.00 49.19 28.45 0.00 25.50 50.10 72.80 105.30 ▇▇▇▇▅
total_acc 9 1.00 22.22 11.60 1.00 14.00 21.00 29.00 90.00 ▇▇▂▁▁
out_prncp 0 1.00 11.97 129.59 0.00 0.00 0.00 0.00 3555.85 ▇▁▁▁▁
out_prncp_inv 0 1.00 11.94 129.24 0.00 0.00 0.00 0.00 3553.30 ▇▁▁▁▁
total_rec_late_fee 0 1.00 1.54 8.07 0.00 0.00 0.00 0.00 209.00 ▇▁▁▁▁
last_pymnt_amnt 0 1.00 2606.45 4412.08 0.00 212.04 518.35 3162.89 35471.86 ▇▁▁▁▁
collections_12_mths_ex_med 44 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
policy_code 0 1.00 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁
acc_now_delinq 9 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
chargeoff_within_12_mths 44 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
delinq_amnt 9 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
pub_rec_bankruptcies 402 0.97 0.04 0.21 0.00 0.00 0.00 0.00 2.00 ▇▁▁▁▁
tax_liens 29 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
predict(xgb_workflow_fit, loan_kaggle, type = "prob") %>%
  bind_cols(.,loan_kaggle) %>%
  dplyr::select(id, loan_status = .pred_default) %>%
  write_csv("final_kaggle_submission.csv")

Top Predictions

top_tp <- xgb_scored_test %>%
  filter(.pred_class == loan_status) %>%
  filter(loan_status == "default") %>%
  slice_max(order_by = .pred_default, n = 10)

top_tn <- xgb_scored_test %>%
  filter(.pred_class == loan_status) %>%
  filter(loan_status == "current") %>%
  slice_max(order_by = .pred_default, n = 10)

top_fp <- xgb_scored_test %>%
  filter(.pred_class != loan_status) %>%
  filter(loan_status == "current") %>%
  slice_max(order_by = .pred_default, n = 10)

bottom_fn <- xgb_scored_test %>%
  filter(.pred_class != loan_status) %>%
  filter(loan_status == "default") %>%
  slice_min(order_by = .pred_default, n = 10)

Partial Dependence Plot

# -- try step_profile
grid <- recipe(loan_status ~ ., data = train) %>%
  step_profile(all_predictors(), -last_pymnt_amnt, profile = vars(last_pymnt_amnt)) %>%
  prep() %>% juice()

predict(xgb_workflow_fit, grid, type = "prob") %>%
  bind_cols(grid %>% dplyr::select(last_pymnt_amnt)) %>%
  ggplot(aes(y = .pred_default, x = last_pymnt_amnt)) +
  geom_path() + stat_smooth() +
  labs(title = "Partial Dependence Plot - Last Payment Amount")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# -- create exlainer object
xgb_explainer <- explain_tidymodels(xgb_workflow_fit, data = train,
                                    y = train$loan_status, verbose = TRUE)
## Preparation of a new explainer is initiated
##   -> model label       :  workflow  (  default  )
##   -> data              :  20843  rows  53  cols 
##   -> data              :  tibble converted into a data.frame 
##   -> target variable   :  20843  values 
##   -> predict function  :  yhat.workflow  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package tidymodels , ver. 1.0.0 , task classification (  default  ) 
##   -> model_info        :  Model info detected classification task but 'y' is a factor .  (  WARNING  )
##   -> model_info        :  By deafult classification tasks supports only numercical 'y' parameter. 
##   -> model_info        :  Consider changing to numerical vector with 0 and 1 values.
##   -> model_info        :  Otherwise I will not be able to calculate residuals or loss function.
##   -> predicted values  :  numerical, min =  0.001366913 , mean =  0.2094293 , max =  0.9826499  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  NA , mean =  NA , max =  NA  
##   A new explainer has been created!
# -- profile the variable of interest
pdp_last_pymnt_amnt <- model_profile(xgb_explainer, variables = "last_pymnt_amnt")

# -- plot it
plot(pdp_last_pymnt_amnt) + labs(title = "Partial Dependence Plot",
                                 subtitle = " ",
                                 x = "last_pymnt_amnt",
                                 y = "Average Impact on Prediction")

# -- PDP: late fees received to date
pdp_total_rec_late_fee <- model_profile(xgb_explainer, variables = "total_rec_late_fee")
plot(pdp_total_rec_late_fee) + labs(title = "Partial Dependence Plot",
                                    subtitle = " ",
                                    x = "Late Fees Received to Date",
                                    y = "Average Impact on Prediction")

# -- PDP: interest rate on the loan
pdp_int_rate <- model_profile(xgb_explainer, variables = "int_rate")
plot(pdp_int_rate) + labs(title = "Partial Dependence Plot",
                          subtitle = " ",
                          x = "Interest Rate on the Loan",
                          y = "Average Impact on Prediction")

# -- PDP: monthly payment owed by the borrower if the loan originates
pdp_installment <- model_profile(xgb_explainer, variables = "installment")
plot(pdp_installment) + labs(title = "Partial Dependence Plot",
                             subtitle = " ",
                             x = "Monthly Payment Owed by Borrower (if the loan originates)",
                             y = "Average Impact on Prediction")

# -- PDP: remaining outstanding principal for total amount funded
pdp_out_prncp <- model_profile(xgb_explainer, variables = "out_prncp")
plot(pdp_out_prncp) + labs(title = "Partial Dependence Plot",
                           subtitle = " ",
                           x = "Remaining Outstanding Principal",
                           y = "Average Impact on Prediction")

# -- PDP: total amount committed by investors for that loan at that point in time
pdp_funded_amnt_inv <- model_profile(xgb_explainer, variables = "funded_amnt_inv")
plot(pdp_funded_amnt_inv) + labs(title = "Partial Dependence Plot",
                                 subtitle = " ",
                                 x = "Total Amount Committed by Investors",
                                 y = "Average Impact on Prediction")

# -- PDP: self-reported annual income provided by the borrower during registration
pdp_annual_inc <- model_profile(xgb_explainer, variables = "annual_inc")
plot(pdp_annual_inc) + labs(title = "Partial Dependence Plot",
                            subtitle = " ",
                            x = "Borrower's Annual Income",
                            y = "Average Impact on Prediction")

# -- PDP: listed amount of the loan applied for by the borrower
pdp_loan_amnt <- model_profile(xgb_explainer, variables = "loan_amnt")
plot(pdp_loan_amnt) + labs(title = "Partial Dependence Plot",
                           subtitle = " ",
                           x = "Amount of Loan",
                           y = "Average Impact on Prediction")

Shap & Breakdown Plots

tidy_explainer <- explain_tidymodels(xgb_workflow_fit, data = test,
                                     y = test$loan_status, label = "tidymodels")
## Preparation of a new explainer is initiated
##   -> model label       :  tidymodels 
##   -> data              :  8934  rows  53  cols 
##   -> data              :  tibble converted into a data.frame 
##   -> target variable   :  8934  values 
##   -> predict function  :  yhat.workflow  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package tidymodels , ver. 1.0.0 , task classification (  default  ) 
##   -> model_info        :  Model info detected classification task but 'y' is a factor .  (  WARNING  )
##   -> model_info        :  By deafult classification tasks supports only numercical 'y' parameter. 
##   -> model_info        :  Consider changing to numerical vector with 0 and 1 values.
##   -> model_info        :  Otherwise I will not be able to calculate residuals or loss function.
##   -> predicted values  :  numerical, min =  0.001402915 , mean =  0.2128489 , max =  0.9764078  
##   -> residual function :  difference between y and yhat (  default  )
## Warning in Ops.factor(y, predict_function(model, data)): '-' not meaningful for
## factors
##   -> residuals         :  numerical, min =  NA , mean =  NA , max =  NA  
##   A new explainer has been created!
shap_explain <- predict_parts(tidy_explainer, top_tp %>% head(1), type = "shap")
plot(shap_explain)

score <- 0.67

as_tibble(shap_explain) %>%
  group_by(variable) %>%
  summarise(contribution = sum(contribution)) %>%
  top_n(wt = abs(contribution), 10) %>%
  mutate(pos_neg = if_else(contribution < 0, "neg", "pos")) %>%
  arrange(desc(contribution)) %>%
  ggplot(aes(x = contribution, y = reorder(variable, contribution), fill = pos_neg)) +
  geom_col() +
  labs(title = paste("Shap Explainer, Predicted Score:", score))

breakdown_explain <- predict_parts(tidy_explainer, top_tp %>% head(1),
                                   type = "break_down_interactions")
plot(breakdown_explain) + labs(title = paste("Breakdown Plot, Predicted Score:", score))

# -- true postive
breakdown_explainer <- function(row){
  breakdown_explain <- predict_parts(tidy_explainer, row, type = "break_down_interactions")
  plot(breakdown_explain) +
  labs(title = paste("Breakdown Plot, Predicted Score:", round(row$.pred_default, 3)))
}

for (row in 1:2){
  dat <- top_tp[row,]
  print(breakdown_explainer(dat))
}

for (row in 3:3){
  dat <- top_tp[row,]
  print(breakdown_explainer(dat))
}

# -- false positive
shap_explain <- predict_parts(tidy_explainer, top_fp %>% head(1), type = "shap")
plot(shap_explain)

as_tibble(shap_explain) %>%
  group_by(variable) %>%
  summarise(contribution = sum(contribution)) %>%
  top_n(wt = abs(contribution), 10) %>%
  mutate(pos_neg = if_else(contribution < 0, "neg", "pos")) %>%
  arrange(desc(contribution)) %>%
  ggplot(aes(x = contribution, y = reorder(variable, contribution), fill = pos_neg)) +
  geom_col() +
  labs(title = paste("Shap Explainer, Predicted Score:", score))

for (row in 1:3){
  dat <- top_fp[row,]
  print(breakdown_explainer(dat))
}

# -- true negative
shap_explain <- predict_parts(tidy_explainer, top_tn %>% head(1), type = "shap")
plot(shap_explain)

as_tibble(shap_explain) %>%
  group_by(variable) %>%
  summarise(contribution = sum(contribution)) %>%
  top_n(wt = abs(contribution), 10) %>%
  mutate(pos_neg = if_else(contribution < 0, "neg", "pos")) %>%
  arrange(desc(contribution)) %>%
  ggplot(aes(x = contribution, y = reorder(variable, contribution), fill = pos_neg)) +
  geom_col() +
  labs(title = paste("Shap Explainer, Predicted Score:", score))

for (row in 1:3){
  dat <- top_tn[row,]
  print(breakdown_explainer(dat))
}

# -- false negative
shap_explain <- predict_parts(tidy_explainer, bottom_fn %>% head(1), type = "shap")
plot(shap_explain)

as_tibble(shap_explain) %>%
  group_by(variable) %>%
  summarise(contribution = sum(contribution)) %>%
  top_n(wt = abs(contribution), 10) %>%
  mutate(pos_neg = if_else(contribution < 0, "neg", "pos")) %>%
  arrange(desc(contribution)) %>%
  ggplot(aes(x = contribution, y = reorder(variable, contribution), fill = pos_neg)) +
  geom_col() +
  labs(title = paste("Shap Explainer, Predicted Score:", score))

for (row in 1:3){
  dat <- bottom_fn[row,]
  print(breakdown_explainer(dat))
}

Isolation Recipe

# -- deal w. categorical 
iso_recipe <- recipe(~.,loan) %>%
  step_rm(id,member_id,emp_title,issue_d,url,desc,title,zip_code,earliest_cr_line,last_pymnt_d,
          next_pymnt_d,last_credit_pull_d) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_novel(all_nominal_predictors()) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

iso_prep <- bake(iso_recipe %>% prep(), loan)

# -- deal w. numeric
iso_numeric <- loan %>% select_if(is.numeric)

iso_recipe <- recipe(~.,iso_numeric) %>%
  step_rm(id, member_id) %>%
  step_impute_median(all_numeric()) %>%
  prep()

Isolation Forest

iso_forest <- isolationForest$new(sample_size = 256,
                                  num_trees = 100,
                                  max_depth = ceiling(log2(256)))

iso_forest$fit(iso_prep)
## INFO  [17:13:11.288] dataset has duplicated rows
## INFO  [17:13:11.354] Building Isolation Forest ...
## INFO  [17:13:12.906] done
## INFO  [17:13:12.908] Computing depth of terminal nodes ...
## INFO  [17:13:13.333] done
## INFO  [17:13:15.124] Completed growing isolation forest

Predict Trainning

pred_train <- iso_forest$predict(iso_prep)

pred_train %>%
  summarise(n = n(),
            min = min(average_depth),
            max = max(average_depth),
            mean = mean(average_depth),
            min_score = min(anomaly_score),
            max_score = max(anomaly_score),
            mean_score = mean(anomaly_score))
# -- average tree depth
pred_train %>%
  ggplot(aes(average_depth)) +
  geom_histogram(bins=20) +
  geom_vline(xintercept = 7.4, linetype = "dotted",
             color = "blue", size = 1.5) + 
  labs(title = "Isolation Forest Tree Depth")

# -- average anomaly score
pred_train %>%
  ggplot(aes(anomaly_score)) +
  geom_histogram(bins=20) +
  geom_vline(xintercept = 0.607, linetype = "dotted",
             color = "blue", size = 1.5) +
  labs(title = "Isolation Forest Anomaly Score")

Create a Data Frame with Anomaly Flag

train_pred <- bind_cols(iso_forest$predict(iso_prep), iso_prep) %>%
  mutate(anomaly = as.factor(if_else(average_depth <= 7.4, "Anomaly", "Normal")))

train_pred %>%
  arrange(average_depth) %>%
  count(anomaly)
head(train_pred, 10)

Fit a Tree

fmla <- as.formula(paste("anomaly ~ ", paste(iso_prep %>% colnames(), collapse= "+")))

outlier_tree <- decision_tree(min_n = 2, tree_depth = 3,
                              cost_complexity = .01) %>%
  set_mode("classification") %>%
  set_engine("rpart") %>%
  fit(fmla, data = train_pred)

outlier_tree$fit
## n= 29777 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 29777 236 Normal (0.007925580 0.992074420)  
##   2) addr_state_AZ>=0.5 667  56 Normal (0.083958021 0.916041979)  
##     4) int_rate>=18.63 41  20 Normal (0.487804878 0.512195122)  
##       8) last_pymnt_amnt>=692.905 26   7 Anomaly (0.730769231 0.269230769) *
##       9) last_pymnt_amnt< 692.905 15   1 Normal (0.066666667 0.933333333) *
##     5) int_rate< 18.63 626  36 Normal (0.057507987 0.942492013) *
##   3) addr_state_AZ< 0.5 29110 180 Normal (0.006183442 0.993816558) *
# -- plotting decision trees
library(rpart.plot)
rpart.plot(outlier_tree$fit, clip.right.labs = FALSE,
           branch = .3, under = TRUE, roundint = FALSE, extra = 3)

Global Anomaly Rules

Convert into English rules

anomaly_rules <- rpart.rules(outlier_tree$fit, roundint = FALSE,
                             extra = 4, cover = TRUE, clip.facs = TRUE) %>%
  clean_names() %>%
  mutate(rule = "IF")

rule_cols <- anomaly_rules %>% dplyr::select(starts_with("x_")) %>% colnames()

for (col in rule_cols){anomaly_rules <- anomaly_rules %>%
  mutate(rule = paste(rule, !!as.name(col)))
}

anomaly_rules %>%
  as.data.frame() %>%
  filter(anomaly == "Anomaly") %>%
  mutate(rule = paste(rule, " THEN ", anomaly)) %>%
  mutate(rule = paste(rule, " coverage ", cover)) %>%
  dplyr::select(rule)
anomaly_rules %>%
  as.data.frame() %>%
  filter(anomaly == "Normal") %>%
  mutate(rule = paste(rule, " THEN ", anomaly)) %>%
  mutate(rule = paste(rule, " coverage ", cover)) %>%
  dplyr::select(rule)
pred_train <- bind_cols(iso_forest$predict(iso_prep), iso_prep)

pred_train %>%
  arrange(desc(anomaly_score)) %>%
  filter(average_depth <= 7.4)

Local Anomaly Rules

fmla <- as.formula(paste("anomaly ~ ", paste(iso_prep %>% colnames(), collapse= "+")))

# -- identify observation as anomaly
pred_train %>%
  mutate(anomaly = as.factor(if_else(id==28006, "Anomaly", "Normal"))) -> local_df

local_tree <-  decision_tree(mode = "classification", tree_depth = 4,
                             min_n = 1, cost_complexity = 0.01) %>%
  set_engine("rpart") %>%
  fit(fmla, local_df)

local_tree$fit
## n= 29777 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)  
##   2) addr_state_OH>=0.5 912 1 Normal (1.096491e-03 9.989035e-01)  
##     4) annual_inc>=159998 21 1 Normal (4.761905e-02 9.523810e-01)  
##       8) sub_grade_C2>=0.5 1 0 Anomaly (1.000000e+00 0.000000e+00) *
##       9) sub_grade_C2< 0.5 20 0 Normal (0.000000e+00 1.000000e+00) *
##     5) annual_inc< 159998 891 0 Normal (0.000000e+00 1.000000e+00) *
##   3) addr_state_OH< 0.5 28865 0 Normal (0.000000e+00 1.000000e+00) *
rpart.rules(local_tree$fit, extra = 4, cover = TRUE, clip.facs = TRUE, roundint = FALSE)
rpart.plot(local_tree$fit, roundint = FALSE, extra = 3)

anomaly_rules <- rpart.rules(local_tree$fit, extra = 4, cover = TRUE, clip.facs = TRUE) %>%
  clean_names() %>%
  filter(anomaly == "Anomaly") %>%
  mutate(rule = "IF") 

rule_cols <- anomaly_rules %>% dplyr::select(starts_with("x_")) %>% colnames()

for (col in rule_cols){
  anomaly_rules <- anomaly_rules %>%
    mutate(rule = paste(rule, !!as.name(col)))
}

as.data.frame(anomaly_rules) %>% dplyr::select(rule, cover)
local_explainer <- function(ID){
  fmla <- as.formula(paste("anomaly ~ ", paste(iso_prep %>% colnames(), collapse= "+")))
  
  pred_train %>%
    mutate(anomaly = as.factor(if_else(id==ID, "Anomaly", "Normal"))) -> local_df
  
  local_tree <- decision_tree(mode="classification",
                              tree_depth = 3,
                              min_n = 1,
                              cost_complexity = 0) %>%
    set_engine("rpart") %>%
    fit(fmla, local_df)
  
  local_tree$fit

  rpart.plot(local_tree$fit, roundint = FALSE, extra = 3) %>% print()
  
  anomaly_rules <- rpart.rules(local_tree$fit, extra = 4, cover = TRUE, clip.facs = TRUE) %>%
    clean_names() %>%
    filter(anomaly == "Anomaly") %>%
    mutate(rule = "IF")
  
  
  rule_cols <- anomaly_rules %>% dplyr::select(starts_with("x_")) %>% colnames()
  
  for (col in rule_cols){
    anomaly_rules <- anomaly_rules %>%
      mutate(rule = paste(rule, !!as.name(col)))
    }
  
  as.data.frame(anomaly_rules) %>%
    dplyr::select(rule, cover) %>%
    print()
  }

pred_train %>%
  slice_max(order_by = anomaly_score, n = 5) %>%
  pull(id) -> anomaly_vect

for (anomaly_id in anomaly_vect){
  print(anomaly_id)
  local_explainer(anomaly_id)
  }
## [1] 12024
## $obj
## n= 29777 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)  
##   2) pub_rec>=2.5 13 1 Normal (7.692308e-02 9.230769e-01)  
##     4) int_rate< 10.73 1 0 Anomaly (1.000000e+00 0.000000e+00) *
##     5) int_rate>=10.73 12 0 Normal (0.000000e+00 1.000000e+00) *
##   3) pub_rec< 2.5 29764 0 Normal (0.000000e+00 1.000000e+00) *
## 
## $snipped.nodes
## NULL
## 
## $xlim
## [1] -0.2  1.2
## 
## $ylim
## [1] -0.15  1.15
## 
## $x
## [1] 0.60738129 0.27793723 0.05830786 0.49756661 0.93682535
## 
## $y
## [1] 0.93276038 0.52435976 0.03427902 0.03427902 0.03427902
## 
## $branch.x
##        [,1]      [,2]       [,3]      [,4]      [,5]
## x 0.6073813 0.2779372 0.05830786 0.4975666 0.9368254
##          NA 0.2779372 0.05830786 0.4975666 0.9368254
##          NA 0.6073813 0.27793723 0.2779372 0.6073813
## 
## $branch.y
##      [,1]      [,2]      [,3]      [,4]      [,5]
## y 1.02051 0.6121098 0.1220290 0.1220290 0.1220290
##        NA 0.8335647 0.4251641 0.4251641 0.8335647
##        NA 0.8335647 0.4251641 0.4251641 0.8335647
## 
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 13"    "Anomaly\n0 / 1"   
## [4] "Normal\n0 / 12"    "Normal\n0 / 29764"
## 
## $cex
## [1] 1
## 
## $boxes
## $boxes$x1
## [1]  0.51073356  0.20097700 -0.03058419  0.42060637  0.84017762
## 
## $boxes$y1
## [1]  0.88316255  0.47476193 -0.01531881 -0.01531881 -0.01531881
## 
## $boxes$x2
## [1] 0.7040290 0.3548975 0.1471999 0.5745268 1.0334731
## 
## $boxes$y2
## [1] 1.0205104 0.6121098 0.1220290 0.1220290 0.1220290
## 
## 
## $split.labs
## [1] ""
## 
## $split.cex
## [1] 1 1 1 1 1
## 
## $split.box
## $split.box$x1
## [1] 0.4570404 0.1478804        NA        NA        NA
## 
## $split.box$y1
## [1] 0.7954125 0.3870119        NA        NA        NA
## 
## $split.box$x2
## [1] 0.7577222 0.4079941        NA        NA        NA
## 
## $split.box$y2
## [1] 0.8717169 0.4633163        NA        NA        NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.rules with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

##                                 rule cover
## 4 IF pub_rec >= 2.5 & int_rate <  11    0%
## [1] 5259
## $obj
## n= 29777 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)  
##   2) last_pymnt_amnt>=34536.97 12 1 Normal (8.333333e-02 9.166667e-01)  
##     4) annual_inc>=446500 1 0 Anomaly (1.000000e+00 0.000000e+00) *
##     5) annual_inc< 446500 11 0 Normal (0.000000e+00 1.000000e+00) *
##   3) last_pymnt_amnt< 34536.97 29765 0 Normal (0.000000e+00 1.000000e+00) *
## 
## $snipped.nodes
## NULL
## 
## $xlim
## [1] -0.2  1.2
## 
## $ylim
## [1] -0.15  1.15
## 
## $x
## [1] 0.60738129 0.27793723 0.05830786 0.49756661 0.93682535
## 
## $y
## [1] 0.93276038 0.52435976 0.03427902 0.03427902 0.03427902
## 
## $branch.x
##        [,1]      [,2]       [,3]      [,4]      [,5]
## x 0.6073813 0.2779372 0.05830786 0.4975666 0.9368254
##          NA 0.2779372 0.05830786 0.4975666 0.9368254
##          NA 0.6073813 0.27793723 0.2779372 0.6073813
## 
## $branch.y
##      [,1]      [,2]      [,3]      [,4]      [,5]
## y 1.02051 0.6121098 0.1220290 0.1220290 0.1220290
##        NA 0.8335647 0.4251641 0.4251641 0.8335647
##        NA 0.8335647 0.4251641 0.4251641 0.8335647
## 
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 12"    "Anomaly\n0 / 1"   
## [4] "Normal\n0 / 11"    "Normal\n0 / 29765"
## 
## $cex
## [1] 1
## 
## $boxes
## $boxes$x1
## [1]  0.51073356  0.20097700 -0.03058419  0.42060637  0.84017762
## 
## $boxes$y1
## [1]  0.88316255  0.47476193 -0.01531881 -0.01531881 -0.01531881
## 
## $boxes$x2
## [1] 0.7040290 0.3548975 0.1471999 0.5745268 1.0334731
## 
## $boxes$y2
## [1] 1.0205104 0.6121098 0.1220290 0.1220290 0.1220290
## 
## 
## $split.labs
## [1] ""
## 
## $split.cex
## [1] 1 1 1 1 1
## 
## $split.box
## $split.box$x1
## [1] 0.34667104 0.06316449         NA         NA         NA
## 
## $split.box$y1
## [1] 0.7954125 0.3870119        NA        NA        NA
## 
## $split.box$x2
## [1] 0.8680915 0.4927100        NA        NA        NA
## 
## $split.box$y2
## [1] 0.8717169 0.4633163        NA        NA        NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.rules with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

##                                                 rule cover
## 4 IF last_pymnt_amnt >= 34537 & annual_inc >= 446500    0%
## [1] 5822
## $obj
## n= 29777 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)  
##   2) int_rate>=22.415 69 1 Normal (1.449275e-02 9.855072e-01)  
##     4) dti< 1.29 1 0 Anomaly (1.000000e+00 0.000000e+00) *
##     5) dti>=1.29 68 0 Normal (0.000000e+00 1.000000e+00) *
##   3) int_rate< 22.415 29708 0 Normal (0.000000e+00 1.000000e+00) *
## 
## $snipped.nodes
## NULL
## 
## $xlim
## [1] -0.2  1.2
## 
## $ylim
## [1] -0.15  1.15
## 
## $x
## [1] 0.60738129 0.27793723 0.05830786 0.49756661 0.93682535
## 
## $y
## [1] 0.93276038 0.52435976 0.03427902 0.03427902 0.03427902
## 
## $branch.x
##        [,1]      [,2]       [,3]      [,4]      [,5]
## x 0.6073813 0.2779372 0.05830786 0.4975666 0.9368254
##          NA 0.2779372 0.05830786 0.4975666 0.9368254
##          NA 0.6073813 0.27793723 0.2779372 0.6073813
## 
## $branch.y
##      [,1]      [,2]      [,3]      [,4]      [,5]
## y 1.02051 0.6121098 0.1220290 0.1220290 0.1220290
##        NA 0.8335647 0.4251641 0.4251641 0.8335647
##        NA 0.8335647 0.4251641 0.4251641 0.8335647
## 
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 69"    "Anomaly\n0 / 1"   
## [4] "Normal\n0 / 68"    "Normal\n0 / 29708"
## 
## $cex
## [1] 1
## 
## $boxes
## $boxes$x1
## [1]  0.51073356  0.20097700 -0.03058419  0.42060637  0.84017762
## 
## $boxes$y1
## [1]  0.88316255  0.47476193 -0.01531881 -0.01531881 -0.01531881
## 
## $boxes$x2
## [1] 0.7040290 0.3548975 0.1471999 0.5745268 1.0334731
## 
## $boxes$y2
## [1] 1.0205104 0.6121098 0.1220290 0.1220290 0.1220290
## 
## 
## $split.labs
## [1] ""
## 
## $split.cex
## [1] 1 1 1 1 1
## 
## $split.box
## $split.box$x1
## [1] 0.4659892 0.1890452        NA        NA        NA
## 
## $split.box$y1
## [1] 0.7954125 0.3870119        NA        NA        NA
## 
## $split.box$x2
## [1] 0.7487733 0.3668293        NA        NA        NA
## 
## $split.box$y2
## [1] 0.8717169 0.4633163        NA        NA        NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.rules with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

##                             rule cover
## 4 IF int_rate >= 22 & dti <  1.3    0%
## [1] 3880
## $obj
## n= 29777 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)  
##   2) out_prncp>=1127.87 120 1 Normal (8.333333e-03 9.916667e-01)  
##     4) out_prncp< 1131.13 1 0 Anomaly (1.000000e+00 0.000000e+00) *
##     5) out_prncp>=1131.13 119 0 Normal (0.000000e+00 1.000000e+00) *
##   3) out_prncp< 1127.87 29657 0 Normal (0.000000e+00 1.000000e+00) *
## 
## $snipped.nodes
## NULL
## 
## $xlim
## [1] -0.2  1.2
## 
## $ylim
## [1] -0.15  1.15
## 
## $x
## [1] 0.60738129 0.27793723 0.05830786 0.49756661 0.93682535
## 
## $y
## [1] 0.93276038 0.52435976 0.03427902 0.03427902 0.03427902
## 
## $branch.x
##        [,1]      [,2]       [,3]      [,4]      [,5]
## x 0.6073813 0.2779372 0.05830786 0.4975666 0.9368254
##          NA 0.2779372 0.05830786 0.4975666 0.9368254
##          NA 0.6073813 0.27793723 0.2779372 0.6073813
## 
## $branch.y
##      [,1]      [,2]      [,3]      [,4]      [,5]
## y 1.02051 0.6121098 0.1220290 0.1220290 0.1220290
##        NA 0.8335647 0.4251641 0.4251641 0.8335647
##        NA 0.8335647 0.4251641 0.4251641 0.8335647
## 
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 120"   "Anomaly\n0 / 1"   
## [4] "Normal\n0 / 119"   "Normal\n0 / 29657"
## 
## $cex
## [1] 1
## 
## $boxes
## $boxes$x1
## [1]  0.51073356  0.20097700 -0.03058419  0.42060637  0.84017762
## 
## $boxes$y1
## [1]  0.88316255  0.47476193 -0.01531881 -0.01531881 -0.01531881
## 
## $boxes$x2
## [1] 0.7040290 0.3548975 0.1471999 0.5745268 1.0334731
## 
## $boxes$y2
## [1] 1.0205104 0.6121098 0.1220290 0.1220290 0.1220290
## 
## 
## $split.labs
## [1] ""
## 
## $split.cex
## [1] 1 1 1 1 1
## 
## $split.box
## $split.box$x1
## [1] 0.4224381 0.1043293        NA        NA        NA
## 
## $split.box$y1
## [1] 0.7954125 0.3870119        NA        NA        NA
## 
## $split.box$x2
## [1] 0.7923245 0.4515452        NA        NA        NA
## 
## $split.box$y2
## [1] 0.8717169 0.4633163        NA        NA        NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.rules with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

##                           rule cover
## 4 IF out_prncp is 1128 to 1131    0%
## [1] 28010
## $obj
## n= 29777 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)  
##   2) sub_grade_E5>=0.5 356 1 Normal (2.808989e-03 9.971910e-01)  
##     4) emp_length_X9.years>=0.5 12 1 Normal (8.333333e-02 9.166667e-01)  
##       8) inq_last_6mths>=3.5 1 0 Anomaly (1.000000e+00 0.000000e+00) *
##       9) inq_last_6mths< 3.5 11 0 Normal (0.000000e+00 1.000000e+00) *
##     5) emp_length_X9.years< 0.5 344 0 Normal (0.000000e+00 1.000000e+00) *
##   3) sub_grade_E5< 0.5 29421 0 Normal (0.000000e+00 1.000000e+00) *
## 
## $snipped.nodes
## NULL
## 
## $xlim
## [1] 0 1
## 
## $ylim
## [1] 0 1
## 
## $x
## [1] 0.68059108 0.42435682 0.20472744 0.05830786 0.35114702 0.64398619 0.93682535
## 
## $y
## [1] 0.93276038 0.66049330 0.38822622 0.03427902 0.03427902 0.03427902 0.03427902
## 
## $branch.x
##        [,1]      [,2]      [,3]       [,4]      [,5]      [,6]      [,7]
## x 0.6805911 0.4243568 0.2047274 0.05830786 0.3511470 0.6439862 0.9368254
##          NA 0.4243568 0.2047274 0.05830786 0.3511470 0.6439862 0.9368254
##          NA 0.6805911 0.4243568 0.20472744 0.2047274 0.4243568 0.6805911
## 
## $branch.y
##      [,1]      [,2]      [,3]      [,4]      [,5]      [,6]     [,7]
## y 1.00026 0.7279933 0.4557262 0.1017790 0.1017790 0.1017790 0.101779
##        NA 0.8564560 0.5841889 0.3119219 0.3119219 0.5841889 0.856456
##        NA 0.8564560 0.5841889 0.3119219 0.3119219 0.5841889 0.856456
## 
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 356"   "Normal\n1 / 12"   
## [4] "Anomaly\n0 / 1"    "Normal\n0 / 11"    "Normal\n0 / 344"  
## [7] "Normal\n0 / 29421"
## 
## $cex
## [1] 1
## 
## $boxes
## $boxes$x1
## [1]  0.611556988  0.369385220  0.149755847 -0.005186464  0.296175429
## [6]  0.589014593  0.867791256
## 
## $boxes$y1
## [1]  0.894608204  0.622341125  0.350074046 -0.003873157 -0.003873157
## [6] -0.003873157 -0.003873157
## 
## $boxes$x2
## [1] 0.7496252 0.4793284 0.2596990 0.1218022 0.4061186 0.6989578 1.0058594
## 
## $boxes$y2
## [1] 1.0002604 0.7279933 0.4557262 0.1017790 0.1017790 0.1017790 0.1017790
## 
## 
## $split.labs
## [1] ""
## 
## $split.cex
## [1] 1 1 1 1 1 1 1
## 
## $split.box
## $split.box$x1
## [1] 0.53314789 0.22961248 0.05003993         NA         NA         NA         NA
## 
## $split.box$y1
## [1] 0.8271082 0.5548411 0.2825740        NA        NA        NA        NA
## 
## $split.box$x2
## [1] 0.8280343 0.6191011 0.3594150        NA        NA        NA        NA
## 
## $split.box$y2
## [1] 0.8858039 0.6135368 0.3412697        NA        NA        NA        NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.rules with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

##                                                                          rule
## 8 IF sub_grade_E5 >= 0.5 & emp_length_X9.years >= 0.5 & inq_last_6mths >= 3.5
##   cover
## 8    0%
## [1] 29658
## $obj
## n= 29777 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)  
##   2) total_rec_late_fee>=87.02027 37 1 Normal (2.702703e-02 9.729730e-01)  
##     4) pub_rec>=0.5 1 0 Anomaly (1.000000e+00 0.000000e+00) *
##     5) pub_rec< 0.5 36 0 Normal (0.000000e+00 1.000000e+00) *
##   3) total_rec_late_fee< 87.02027 29740 0 Normal (0.000000e+00 1.000000e+00) *
## 
## $snipped.nodes
## NULL
## 
## $xlim
## [1] -0.2  1.2
## 
## $ylim
## [1] -0.15  1.15
## 
## $x
## [1] 0.60738129 0.27793723 0.05830786 0.49756661 0.93682535
## 
## $y
## [1] 0.93276038 0.52435976 0.03427902 0.03427902 0.03427902
## 
## $branch.x
##        [,1]      [,2]       [,3]      [,4]      [,5]
## x 0.6073813 0.2779372 0.05830786 0.4975666 0.9368254
##          NA 0.2779372 0.05830786 0.4975666 0.9368254
##          NA 0.6073813 0.27793723 0.2779372 0.6073813
## 
## $branch.y
##      [,1]      [,2]      [,3]      [,4]      [,5]
## y 1.02051 0.6121098 0.1220290 0.1220290 0.1220290
##        NA 0.8335647 0.4251641 0.4251641 0.8335647
##        NA 0.8335647 0.4251641 0.4251641 0.8335647
## 
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 37"    "Anomaly\n0 / 1"   
## [4] "Normal\n0 / 36"    "Normal\n0 / 29740"
## 
## $cex
## [1] 1
## 
## $boxes
## $boxes$x1
## [1]  0.51073356  0.20097700 -0.03058419  0.42060637  0.84017762
## 
## $boxes$y1
## [1]  0.88316255  0.47476193 -0.01531881 -0.01531881 -0.01531881
## 
## $boxes$x2
## [1] 0.7040290 0.3548975 0.1471999 0.5745268 1.0334731
## 
## $boxes$y2
## [1] 1.0205104 0.6121098 0.1220290 0.1220290 0.1220290
## 
## 
## $split.labs
## [1] ""
## 
## $split.cex
## [1] 1 1 1 1 1
## 
## $split.box
## $split.box$x1
## [1] 0.3717279 0.1275963        NA        NA        NA
## 
## $split.box$y1
## [1] 0.7954125 0.3870119        NA        NA        NA
## 
## $split.box$x2
## [1] 0.8430347 0.4282782        NA        NA        NA
## 
## $split.box$y2
## [1] 0.8717169 0.4633163        NA        NA        NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.rules with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

##                                           rule cover
## 4 IF total_rec_late_fee >= 87 & pub_rec >= 0.5    0%